# Load required libraries
library(tidyverse)
library(tidytext)
library(tm)
library(SnowballC)
library(Rtsne)
library(ggplot2)
library(plotly)
library(scales)
library(wordcloud)
library(viridis)
library(DT)
# Set random seed for reproducibility
set.seed(42)
# Read data (update the URL or file path as needed)
df <- read.csv("https://raw.githubusercontent.com/JIHONGKING/Data_Analysis/refs/heads/main/filtered_amazon_reviews.csv", stringsAsFactors = FALSE)
# Clean and preprocess data
df_clean <- df %>%
  filter(!is.na(Text),
         Score %in% 1:5,
         nchar(Text) > 50,        # Filter out very short reviews
         nchar(Text) < 1000) %>%  # Filter out extremely long reviews
  mutate(sentiment = case_when(
    Score <= 2 ~ "Negative",
    Score == 3 ~ "Neutral",
    Score >= 4 ~ "Positive"
  )) %>%
  distinct(Text, .keep_all = TRUE)  # Remove duplicate reviews
# Summarize the sentiment distribution before sampling
sentiment_counts <- df_clean %>%
  count(sentiment) %>%
  mutate(percentage = n / sum(n) * 100)
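# One way to display the distribution (a sketch; DT is loaded above, though
# a plain print(sentiment_counts) would work just as well)
datatable(sentiment_counts, options = list(dom = "t")) %>%
  formatRound("percentage", digits = 1)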
# Create a balanced sample with 500 reviews from each sentiment category
# Helper: sample n rows, sampling with replacement (oversampling)
# when fewer than n rows are available
sample_n_or_all <- function(df, n) {
  if (nrow(df) >= n) {
    sample_n(df, n)
  } else {
    sample_n(df, n, replace = TRUE)
  }
}
# Stratified sampling for balanced sentiment representation
df_neg <- df_clean %>% filter(sentiment == "Negative") %>% sample_n_or_all(500)
df_neu <- df_clean %>% filter(sentiment == "Neutral") %>% sample_n_or_all(500)
df_pos <- df_clean %>% filter(sentiment == "Positive") %>% sample_n_or_all(500)
df_sample <- bind_rows(df_neg, df_neu, df_pos)
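# Optional sanity check (not part of the original analysis): each sentiment
# should now contribute exactly 500 reviews
count(df_sample, sentiment)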
# Text preprocessing: tokenize, remove stop words, stem, and count
reviews_tidy <- df_sample %>%
  select(Id, Text, sentiment) %>%
  unnest_tokens(word, Text) %>%
  # Remove stop words (join key made explicit to silence the join message)
  anti_join(stop_words, by = "word") %>%
  # Filter out short words (often not meaningful)
  filter(nchar(word) > 2) %>%
  # Apply stemming to group related words
  mutate(word = wordStem(word)) %>%
  count(Id, sentiment, word, sort = TRUE)  # Returns an ungrouped tibble
# Calculate TF-IDF (each review Id is treated as one document; TF-IDF
# down-weights terms that appear across many reviews)
tfidf <- reviews_tidy %>%
  bind_tf_idf(word, Id, n)
# Create a document-term matrix with TF-IDF weights
dtm <- tfidf %>%
  cast_dtm(document = Id, term = word, value = tf_idf)
# Convert to matrix
dtm_mat <- as.matrix(dtm)
# Remove any zero-variance features
var_cols <- apply(dtm_mat, 2, var)
dtm_filtered <- dtm_mat[, var_cols > 0]
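# Optional sanity check: one row per sampled review, one column per
# retained stemmed term
dim(dtm_filtered)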
# Map sentiment information back to the matrix rows
ids <- rownames(dtm_filtered)
sentiment_map <- df_sample %>%
  select(Id, sentiment, Text, Score) %>%
  mutate(Id = as.character(Id)) %>%
  # Guard against duplicated Ids introduced by oversampling, which would
  # otherwise duplicate rows in the join below
  distinct(Id, .keep_all = TRUE)
# Run t-SNE
set.seed(123) # For reproducibility
tsne_model <- Rtsne(dtm_filtered, dims = 2,
                    perplexity = 30,  # Balances local and global structure
                    max_iter = 1000,  # More iterations for better convergence
                    verbose = TRUE)
## Performing PCA
## Read the 1500 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.12 seconds (sparsity = 0.105289)!
## Learning embedding...
## Iteration 50: error is 71.855611 (50 iterations in 0.11 seconds)
## Iteration 100: error is 70.894975 (50 iterations in 0.10 seconds)
## Iteration 150: error is 70.872609 (50 iterations in 0.09 seconds)
## Iteration 200: error is 70.872434 (50 iterations in 0.08 seconds)
## Iteration 250: error is 70.872518 (50 iterations in 0.08 seconds)
## Iteration 300: error is 1.887525 (50 iterations in 0.07 seconds)
## Iteration 350: error is 1.662712 (50 iterations in 0.08 seconds)
## Iteration 400: error is 1.597569 (50 iterations in 0.08 seconds)
## Iteration 450: error is 1.569047 (50 iterations in 0.08 seconds)
## Iteration 500: error is 1.551669 (50 iterations in 0.08 seconds)
## Iteration 550: error is 1.540649 (50 iterations in 0.08 seconds)
## Iteration 600: error is 1.533899 (50 iterations in 0.08 seconds)
## Iteration 650: error is 1.528895 (50 iterations in 0.08 seconds)
## Iteration 700: error is 1.525544 (50 iterations in 0.08 seconds)
## Iteration 750: error is 1.521983 (50 iterations in 0.08 seconds)
## Iteration 800: error is 1.517806 (50 iterations in 0.08 seconds)
## Iteration 850: error is 1.515856 (50 iterations in 0.07 seconds)
## Iteration 900: error is 1.513360 (50 iterations in 0.07 seconds)
## Iteration 950: error is 1.513138 (50 iterations in 0.06 seconds)
## Iteration 1000: error is 1.510683 (50 iterations in 0.06 seconds)
## Fitting performed in 1.57 seconds.
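# Note: Rtsne() stops with an error on duplicate rows by default (its
# check_duplicates argument); if the oversampling step above ever repeats a
# review, deduplicate dtm_filtered first or pass check_duplicates = FALSE.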
# Combine t-SNE results with metadata
df_tsne <- as.data.frame(tsne_model$Y) %>%
  mutate(Id = ids) %>%
  left_join(sentiment_map, by = "Id") %>%
  # Create a shortened text version for tooltips
  mutate(short_text = str_trunc(Text, 200))
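# Optional sanity check: the join should keep exactly one row per embedded point
stopifnot(nrow(df_tsne) == nrow(tsne_model$Y))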
# Create custom tooltip for interactive plot
tooltip_text <- paste0(
  "Rating: ", df_tsne$Score, "<br>",
  "Sentiment: ", df_tsne$sentiment, "<br>",
  "Review: ", df_tsne$short_text
)
# Interactive visualization
p <- plot_ly(df_tsne,
             x = ~V1,
             y = ~V2,
             type = "scatter",
             mode = "markers",
             color = ~sentiment,
             colors = c("Negative" = "#E74C3C",
                        "Neutral" = "#7F8C8D",
                        "Positive" = "#2ECC71"),
             text = tooltip_text,
             hoverinfo = "text",
             marker = list(
               size = 8,
               opacity = 0.7,
               line = list(width = 1, color = "#FFFFFF")
             )) %>%
  layout(
    title = list(
      text = "Interactive t-SNE Map of Amazon Food Review Sentiments",
      font = list(size = 20)
    ),
    xaxis = list(
      title = "t-SNE Dimension 1",
      zeroline = FALSE,
      showgrid = FALSE
    ),
    yaxis = list(
      title = "t-SNE Dimension 2",
      zeroline = FALSE,
      showgrid = FALSE
    ),
    legend = list(
      title = list(text = "Sentiment"),
      x = 0.01,
      y = 0.99,
      bgcolor = "rgba(255, 255, 255, 0.7)"
    ),
    margin = list(l = 50, r = 50, b = 100, t = 100),
    annotations = list(
      x = 0.5,
      y = -0.1,
      xref = "paper",
      yref = "paper",
      text = "Hover over points to see review details. Notice how sentiments cluster in the semantic space.",
      showarrow = FALSE
    )
  )
# Static visualization 1: Top words by sentiment
# Calculate word importance per sentiment
word_importance <- reviews_tidy %>%
  group_by(sentiment, word) %>%
  summarize(
    frequency = sum(n),             # Total occurrences of the word
    total_reviews = n_distinct(Id), # Number of reviews containing it
    .groups = "drop"
  ) %>%
  # Keep words that appear in at least 10 reviews
  filter(total_reviews >= 10) %>%
  # Calculate the word's distinctiveness for each sentiment
  group_by(word) %>%
  mutate(
    total_freq = sum(frequency),
    sentiment_ratio = frequency / total_freq
  ) %>%
  ungroup() %>%
  # Keep words that are strongly associated with one sentiment
  filter(sentiment_ratio > 0.6)
# Get the top words for each sentiment
top_words <- word_importance %>%
  group_by(sentiment) %>%
  slice_max(frequency, n = 15) %>%  # Top 15 by frequency (ties kept, as with top_n)
  ungroup() %>%
  mutate(word = reorder_within(word, frequency, sentiment))
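# The data above is prepared with reorder_within() but no plot is drawn; the
# sketch below is one way to finish it (the manual palette is an assumption,
# chosen to match the interactive plot's colors)
top_words_plot <- top_words %>%
  ggplot(aes(x = frequency, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +  # Strips the facet suffix added by reorder_within()
  scale_fill_manual(values = c("Negative" = "#E74C3C",
                               "Neutral" = "#7F8C8D",
                               "Positive" = "#2ECC71")) +
  facet_wrap(~ sentiment, scales = "free_y") +
  labs(
    title = "Most Distinctive Stemmed Words by Sentiment",
    x = "Frequency",
    y = NULL
  ) +
  theme_minimal(base_size = 14)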
# Static visualization 2: Sentiment distribution by rating
rating_sentiment_plot <- df_sample %>%
  count(Score, sentiment) %>%
  group_by(Score) %>%
  mutate(percentage = n / sum(n) * 100) %>%
  ggplot(aes(x = factor(Score), y = sentiment, fill = percentage)) +
  geom_tile() +
  geom_text(aes(label = paste0(round(percentage), "%")),
            color = "white", fontface = "bold") +
  scale_fill_viridis(option = "D", direction = -1) +
  labs(
    title = "Sentiment Distribution Across Rating Spectrum",
    subtitle = "Mapping between numerical ratings and sentiment categories",
    x = "Rating (1-5 stars)",
    y = NULL,
    fill = "% of Reviews"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    panel.grid = element_blank(),
    axis.text = element_text(size = 12),
    plot.title = element_text(face = "bold")
  )
# Display visualizations
p
rating_sentiment_plot
top_words_plot